# Bootstrapped models #
# This script will run the bootstrapped models, and save the results in Tabular format
# It will also save output necessary to compute the plotted mean, divergence, and sorting values used to prepare the figures

# Load packages and install them if necessary (automatic)
## Note! We also use the plyr package to merge data. If you load plyr, it will overrule some dplyr functions, like mutate, which is why we explicitly call plyr::rbind.fill() 
## You will need to have the plyr package installed (but not necessarily loaded)
## If not, run #install.packages('plyr')
need = c(
  "dplyr",           #data structuring/cleaning
  "estimatr",        #run clustered lm models in source code
  "here",            #setting wd
  "janitor",         #data structuring/cleaning
  "sandwich",        #robust se's
  "scales",          #data structuring/cleaning
  "stringr",         #data structuring/cleaning
  "tidyr"            #data structuring/cleaning
)     
have = need %in% rownames(installed.packages())
if(any(!have)) {install.packages(need[!have])}
pack <- lapply(need, library, character.only = TRUE)
# Clean environment
rm(have, need, pack)

# Check wd
here::here()

# Start with setting a seed for replication purposes (just in case, also set in function)
set.seed(28)

# Optional but handy (turn off scientific notations)
options(scipen = 999) 

# Load functions for bootstraps and computing output
source("D:/dv_rep/Source/boot_functions.R")
# Load data (includes immigration ANES data)
source("D:/dv_rep/Source/dat_gss.R")

#### Bootstrapped models ####

# Set N bootstraps to 1000
n_bootstraps <- 1000

# Bootstraps without controls (note: together, these bootstraps will take a considerable amount of time, up to several hours!)
boot_Polviews  <- perform_bootstrap(gss_Polviews, n_bootstraps, include_controls = FALSE)
boot_Swelfare  <- perform_bootstrap(gss_Swelfare, n_bootstraps, include_controls = FALSE)
boot_Racgender <- perform_bootstrap(gss_Racgender, n_bootstraps, include_controls = FALSE)
boot_Cultmoral <- perform_bootstrap(gss_Cultmoral, n_bootstraps, include_controls = FALSE)
boot_Imm       <- perform_bootstrap(anes_Immig, n_bootstraps, include_controls = FALSE)

# Bootstraps with controls
cboot_Polviews  <- perform_bootstrap(gss_Polviews, n_bootstraps, include_controls = TRUE)
cboot_Swelfare  <- perform_bootstrap(gss_Swelfare, n_bootstraps, include_controls = TRUE)
cboot_Racgender <- perform_bootstrap(gss_Racgender, n_bootstraps, include_controls = TRUE)
cboot_Cultmoral <- perform_bootstrap(gss_Cultmoral, n_bootstraps, include_controls = TRUE)
cboot_Imm       <- perform_bootstrap(anes_Immig, n_bootstraps, include_controls = TRUE)

# Not run: Run lm_robust models 
# This runs all models as lm models with robust standard errors using estimatr, clustered by period (year)
# The main difference concerns the standard errors for the period estimates, which are smaller for the period effects in the clustered lm_robust models
#source("D:/POBE/Source/lmrobust_gss.R")

# Feel free to view these results, for example:
#summary(gss_meanPolviews) # mean model GSS ideological self-placement (without controls)
#summary(sort_gssPolviews) # sorting model GSS ideological self-placement (without controls)
# note that the summary output is a little different from a regular lm object
# see source code for more details

# A simple example of a plotted sorting model (note: the reference level is plotted first. results differ slightly mainly in the SEs - example shows the greatest difference in comparison to the boot (period effects))
# You'll need to have the marginaleffects package installed
#marginaleffects::plot_slopes(sort_gssPolviews, draw = TRUE, variables = "sort_iv", by = "year", newdata = "mean") + ggplot2::theme_minimal()

# Extract parameters bootstrapped models without controls
param_Polviews  <- extract(
  boot_Polviews, 
  data_frame  = gss_Polviews
)

param_Swelfare <- extract(
  boot_Swelfare, 
  data_frame = gss_Swelfare
)

param_Cmoral <- extract(
  boot_Cultmoral, 
  data_frame = gss_Cultmoral
)

param_Rgender <- extract(
  boot_Racgender, 
  data_frame = gss_Racgender
)

param_Imm <- extract(
  boot_Imm, 
  data_frame = anes_Immig
)

# Models with control variables
cparam_Polviews <- extract(
  cboot_Polviews, 
  data_frame = gss_Polviews
)

cparam_Swelfare <- extract(
  cboot_Swelfare, 
  data_frame = gss_Swelfare
)

cparam_Cmoral <- extract(
  cboot_Cultmoral, 
  data_frame = gss_Cultmoral
)

cparam_Rgender <- extract(
  cboot_Racgender, 
  data_frame = gss_Racgender
)

cparam_Imm <- extract(
  cboot_Imm, 
  data_frame = anes_Immig
)

# Make data frames for each model + add column to indicate the variable
tab_Polviews     <- make_Tab(param_Polviews)
tab_Polviews$var <- "Ideological self-placement"
tab_Swelfare     <- make_Tab(param_Swelfare)
tab_Swelfare$var <- "Social welfare"
tab_Rgender      <- make_Tab(param_Rgender)
tab_Rgender$var  <- "Race and gender"
tab_Cmoral       <- make_Tab(param_Cmoral)
tab_Cmoral$var   <- "Culture and morality"
tab_Imm          <- make_Tab(param_Imm)
tab_Imm$var      <- "Immigration"

# Repeat for models with control variables
ctab_Polviews     <- make_Tab(cparam_Polviews)
ctab_Polviews$var <- "Ideological self-placement"
ctab_Swelfare     <- make_Tab(cparam_Swelfare)
ctab_Swelfare$var <- "Social welfare"
ctab_Rgender      <- make_Tab(cparam_Rgender)
ctab_Rgender$var  <- "Race and gender"
ctab_Cmoral       <- make_Tab(cparam_Cmoral)
ctab_Cmoral$var   <- "Culture and morality"
ctab_Imm          <- make_Tab(cparam_Imm)
ctab_Imm$var      <- "Immigration"

# Combine to a single data frame
gss_Tab <- plyr::rbind.fill(
  tab_Polviews,
  tab_Swelfare,
  tab_Rgender,
  tab_Cmoral,
  tab_Imm
) 

# Add column to specify that these models are indeed lacking additional control variables
gss_Tab$controls <- paste("Without")

# Models with controls
cgss_Tab <- plyr::rbind.fill(
  ctab_Polviews,
  ctab_Swelfare,
  ctab_Rgender,
  ctab_Cmoral,
  ctab_Imm
) 

# Repeat
cgss_Tab$controls <- paste("With")

# Combine to single data frame (please note: you need to have the plyr package installed - but we do not load it explicitly to avoid errors with dplyr)
gss_Boot <- plyr::rbind.fill(
  gss_Tab,
  cgss_Tab
)

# Data structuring: add columns for indexing
gss_Boot <- gss_Boot %>%
  mutate(term = gsub(":_C", "", term),
         term = case_when(
           term == "_C" ~ "Variable",
           term == ":white" ~ ":White",
           term == ":college" ~ ":College", 
           term == ":protcattend" ~ ":Protestant",
           TRUE ~ term
         )) %>%
  mutate(group = case_when(
    startsWith(term, "Cohort") ~ "Cohort",
    startsWith(term, "Age") ~ "Age",
    startsWith(term, "Year") ~ "Year",
    startsWith(term, ":Cohort") ~ "Cohort",
    startsWith(term, ":Age") ~ "Age",
    startsWith(term, ":Year") ~ "Year",
    startsWith(term, ":_C:Cohort") ~ "Cohort",
    startsWith(term, ":_C:Age") ~ "Age",
    startsWith(term, ":_C:Year") ~ "Year",
    startsWith(term, "White") ~ "White",
    startsWith(term, "College") ~ "College",
    startsWith(term, "Year") ~ "Protestant",
    startsWith(term, ":White") ~ "White",
    startsWith(term, ":College") ~ "College",
    startsWith(term, ":Protestant") ~ "Protestant",
    TRUE ~ term
  )) %>%
  mutate(boot = case_when(
    startsWith(model, "Boot") ~ "Boot",
    TRUE ~ "OLS"
  )) %>%
  mutate(term = gsub("Cohort |Age |Year ", "", term),
         facet = gsub("Boot |Original ", "", model)) 

# Add proportion of respondents for each group to data frame (we'll need this to compute values for plotting)
# This proportion column is called Freq (frequency respondents)
prop_polviews  <- prop(gss_Polviews, "Ideological self-placement", gen = "genPew")
prop_swelfare  <- prop(gss_Swelfare, "Social welfare", gen = "genPew")
prop_rgender   <- prop(gss_Racgender, "Race and gender", gen = "genPew")
prop_cultmoral <- prop(gss_Cultmoral, "Culture and morality", gen = "genPew")
prop_immig     <- prop(anes_Immig, "Immigration", gen = "genPew")

# Merge
prop_merge     <- plyr::rbind.fill(
  prop_polviews,
  prop_swelfare,
  prop_rgender,
  prop_cultmoral,
  prop_immig
)

# Combine
gss_Boot <- gss_Boot %>% left_join(prop_merge, by = c("term", "var"))

# Save for future use (i.e., plotting)
save(gss_Boot, file = "C:/Users/tjocker/desktop/POBE/Datasets/gss_Boot.Rdata")

# Additional data structuring for p-values
# Warning refers to the fact that we don't have a standard error for R2 for the OLS models, which is true
gss_Tab <- gss_Boot %>% 
  mutate(
    p_star = case_when(
      p < 0.001 ~ "***",
      p < 0.01  ~ "**",
      p < 0.05  ~ "*",
      TRUE ~ ""
    )) %>%
  mutate(across(c(coef, se), ~ as.numeric(.))) %>%
  mutate(across(c(r2, r2_dev, adj_r2), ~ as.numeric(format_fixed_decimals(., decimals = 3)))) %>%
  mutate(coef_p  = paste0(format_fixed_decimals(coef, 2), " (", format_fixed_decimals(se, 2), ")", p_star)
  )

# Convert to wide format
tab_Wide <- gss_Tab %>%
  select(term, controls, coef_p, facet, boot, var) %>%
  pivot_wider(., names_from = var, values_from = coef_p) %>%
  janitor::clean_names() %>%
  select(
    term,
    facet,
    boot,
    controls,
    ideological_self_placement,
    social_welfare,
    race_and_gender, 
    culture_and_morality,
    immigration,
  ) %>%
  as.data.frame()

# Extract more parameters of interest (R2 and N, mainly)
rows_Tab <- gss_Tab %>%
  select(controls, facet, boot, var, r2, r2_dev, adj_r2, N) %>%
  distinct %>%
  pivot_longer(cols = c(r2, r2_dev, adj_r2, N), names_to = "term", values_to = "coef_p") %>%
  mutate(coef_p = ifelse(term %in% c("r2", "adj_2", "r2_dev"), format_fixed_decimals(coef_p, 3), coef_p)) %>%
  mutate(coef_p = as.character(coef_p)) %>%
  mutate(coef_p = case_when(term == "r2" & boot == "Boot" ~ paste0(coef_p, " (", lead(coef_p), ")"), TRUE ~ coef_p)) %>%
  filter(term != "r2_dev") %>%
  pivot_wider(., names_from = var, values_from = coef_p) %>%
  janitor::clean_names()

# Create a list of unique groups in tab_Wide
unique_groups <- unique(tab_Wide[c("controls", "facet", "boot")])

# Initialize an empty data frame to store the combined result
combined_df <- data.frame()

# Iterate over unique groups and append rows_Tab after each group in tab_Wide
for (i in 1:nrow(unique_groups)) {
  group <- unique_groups[i, ]
  group_rows_Tab <- rows_Tab %>%
    filter(controls == group$controls, facet == group$facet, boot == group$boot)
  
  group_tab_Wide <- tab_Wide %>%
    filter(controls == group$controls, facet == group$facet, boot == group$boot)
  
  combined_group <- bind_rows(group_tab_Wide, group_rows_Tab)
  combined_df <- bind_rows(combined_df, combined_group)
}

# Set apart bootstrapped mean levels
meanTab <- combined_df %>%
  filter(facet == "Mean" & boot == "Boot") %>%
  select(!boot & !facet)
# Pivot wider
meanTab <- meanTab %>% 
  pivot_wider(., names_from = controls, values_from = c(
    "ideological_self_placement",
    "social_welfare",
    "race_and_gender",
    "culture_and_morality",
    "immigration"
  )) %>%
  mutate_all(~ replace_na(., "")) 
# Write table mean levels to .csv (we edit te tables in Word)
write.csv(meanTab, file = "gss_meanTab.csv", row.names = FALSE)

# Set apart bootstrapped divergence values
divTab <- combined_df %>%
  filter(facet == "Divergence" & boot == "Boot") %>%
  select(!boot & !facet)
# Pivot wider
divTab <- divTab %>% 
  pivot_wider(., names_from = controls, values_from = c(
    "ideological_self_placement",
    "social_welfare",
    "race_and_gender",
    "culture_and_morality",
    "immigration"
  )) %>%
  mutate_all(~ replace_na(., "")) 
# Write table divergence results to .csv
write.csv(divTab, file = "gss_divTab.csv", row.names = FALSE)

# Set apart bootstrapped sorting
sortTab <- combined_df %>%
  filter(facet == "Sorting" & boot == "Boot") %>%
  select(!boot & !facet)
# Pivot wider
sortTab <- sortTab %>% 
  pivot_wider(., names_from = controls, values_from = c(
    "ideological_self_placement",
    "social_welfare",
    "race_and_gender",
    "culture_and_morality",
    "immigration"
  )) %>%
  mutate_all(~ replace_na(., "")) 
# Write sorting values to .csv
write.csv(sortTab, file = "gss_sortTab.csv", row.names = FALSE)

# Optional: Save work env
save.image(file = "D:dv_rep/Datasets/boot_gss_image.Rdata")